#Set my seed
## [1] "/local/workdir/mld253/git_repos/comp_micro_proj"
#Testing timing of Rmd document knitting
## [1] "2024-03-17 11:57:19 EDT"
#Load libraries
library(pacman)
pacman::p_load(tidyverse, BiocManager, devtools, dada2,
phyloseq, patchwork, DT, iNEXT, vegan, ggplot2, cowplot,
install = FALSE)
library(patchwork)## [1] "/local/workdir/mld253/git_repos/comp_micro_proj"
raw_fastqs_path <- "ncbi_data/fastq_files"
#raw_fastqs_path
#head(list.files(raw_fastqs_path))
#str(list.files(raw_fastqs_path))
forward_reads <- list.files(raw_fastqs_path, pattern="_1.fastq.gz", full.names = TRUE)
#head(forward_reads)
#str(forward_reads)
reverse_reads <- list.files(raw_fastqs_path, pattern="_2.fastq.gz", full.names = TRUE)
#head(reverse_reads)## [1] 72 1 40 57 50 35 65 56 52 44 46 66
forward_raw_plot_12 <- plotQualityProfile(forward_reads[random_samples]) +
labs(title="Forward Reads Raw Quality")
reverse_raw_plot_12 <- plotQualityProfile(reverse_reads[random_samples]) +
labs(title="Reverse Reads Raw Quality")
#ggplot2/patchwork was having difficulty combining plots, I am not entirely certain why. chatGPT suggested installing cowplot. and it worked better
library(cowplot)
# Combine plots using cowplot
combined_plots <- plot_grid(forward_raw_plot_12, reverse_raw_plot_12, ncol = 2)
# Display the combined plots
print(combined_plots)forward_raw_aggregated_plot <- plotQualityProfile(forward_reads, aggregate = TRUE) + labs(title="Aggregated Raw Fwd Reads")
reverse_raw_aggregated_plot <- plotQualityProfile(reverse_reads, aggregate=TRUE) + labs(title= "Aggregated Raw Rev Reads")
plot_grid(forward_raw_aggregated_plot, reverse_raw_aggregated_plot, ncol=2)preqc_aggregated_plot <- plot_grid(forward_raw_aggregated_plot, reverse_raw_aggregated_plot, ncol=2)There is a steep drop off in quality near the end of each reads, as expected. The reverse reads are also generally lower in quality, but this was also expected.
There is also a small number of bases at the very beginning of each read that has a consistently low quality. This needs to be trimmed out.
Some of the ends of the raw reads also have uncomfortably low quality.. around even a quality score of 10. We will see what happens further downstream with the analysis!
## [1] "SRR19625066" "SRR19625067" "SRR19625068" "SRR19625069" "SRR19625070"
## [6] "SRR19625071"
## [1] "ncbi_data/filtered_fastqs"
filtered_forward_reads <-file.path(filtered_fastqs_path, paste0(samples,"_R1_filtered.fastq.gz"))
length(filtered_forward_reads)## [1] 72
filtered_reverse_reads <-file.path(filtered_fastqs_path, paste0(samples, "_R2_filtered.fastq.gz"))
length(filtered_reverse_reads)## [1] 72
## [1] "ncbi_data/filtered_fastqs/SRR19625066_R2_filtered.fastq.gz"
## [2] "ncbi_data/filtered_fastqs/SRR19625067_R2_filtered.fastq.gz"
## [3] "ncbi_data/filtered_fastqs/SRR19625068_R2_filtered.fastq.gz"
## [4] "ncbi_data/filtered_fastqs/SRR19625069_R2_filtered.fastq.gz"
## [5] "ncbi_data/filtered_fastqs/SRR19625070_R2_filtered.fastq.gz"
## [6] "ncbi_data/filtered_fastqs/SRR19625071_R2_filtered.fastq.gz"
First trimming runthrough, with “tight” trimming parameters for this data
filtered_reads <- filterAndTrim(fwd=forward_reads,
filt=filtered_forward_reads,
rev = reverse_reads,
filt.rev=filtered_reverse_reads,
maxN=0,
maxEE=c(2,2),
trimLeft=c(19,20),
truncQ=2,
rm.phix = TRUE,
compress=TRUE,
multithread=TRUE)forward_filteredQual_plot_12 <-
plotQualityProfile(filtered_forward_reads[random_samples]) +
labs(title = "Trimmed Forward Read Quality")
reverse_filteredQual_plot_12 <-
plotQualityProfile(filtered_reverse_reads[random_samples]) +
labs(title = "Trimmed Reverse Read Quality")
# Put the two plots together , using cowplot
plot_grid(forward_filteredQual_plot_12, reverse_filteredQual_plot_12, ncol=2)#Aggregate Trimmed Plots
forward_QC_plot <- plotQualityProfile(filtered_forward_reads, aggregate=TRUE) + labs(title="Filtered Aggregated Fwd Reads")
reverse_QC_plot <- plotQualityProfile(filtered_reverse_reads,aggregate=TRUE) + labs(title="Filtered Aggregated Rev Reads")
#Plot together with cowplot again
plot_grid(forward_QC_plot, reverse_QC_plot, ncol=2)Steep quality dropoff at the beginning of the reads is gone - successfully trimmed start of the reads, and primer sequences are removed
## reads.in reads.out
## SRR19625066_1.fastq.gz 77543 33811
## SRR19625067_1.fastq.gz 73819 30748
## SRR19625068_1.fastq.gz 248353 99890
## SRR19625069_1.fastq.gz 169479 67389
## SRR19625070_1.fastq.gz 191391 86342
## SRR19625071_1.fastq.gz 131759 58224
filtered_df %>%
reframe(median_reads_in = median(reads.in),
median_reads_out = median(reads.out),
median_percent_retained=(median(reads.out)/median(reads.in)))## median_reads_in median_reads_out median_percent_retained
## 1 119394 49787.5 0.4170017
That filtered out a LARGE NUMBER of reads!! Only 41% retained! Maybe further quality filtering is not necessary!!
I will try again with slightly lower error filtering
maxEE=2,3 so that we can compare both outputs.
#Filtered reads path placeholder
filtered_fastqs_2_path <- "ncbi_data/filtered_fastqs_lower_threshold"
#Filtered reads vectors
filtered_forward_reads_2 <-file.path(filtered_fastqs_2_path, paste0(samples,"_R1_filtered.fastq.gz"))
length(filtered_forward_reads)## [1] 72
filtered_reverse_reads_2 <-file.path(filtered_fastqs_2_path, paste0(samples, "_R2_filtered.fastq.gz"))
#Trimming with lower paramaters
filtered_reads_2 <- filterAndTrim(fwd=forward_reads,
filt=filtered_forward_reads_2,
rev = reverse_reads,
filt.rev=filtered_reverse_reads_2,
maxN=0,
maxEE=c(2,3),
# Reverse reads were lower in quality - perhaps increasing the expected errors will increase retained reads
#This is the only parameter that has changed
trimLeft=c(19,20),
truncLen = c(240,220),
truncQ=2,
rm.phix = TRUE,
compress=TRUE,
multithread=TRUE)forward_filtered_qual_2 <- plotQualityProfile(filtered_forward_reads_2[random_samples]) + labs(title="Trimmed Forward Read Quality, Higher Error Rate")
reverse_filtered_qual_2 <- plotQualityProfile(filtered_reverse_reads_2[random_samples]) + labs(title="Trimmed Reverse Read Quality, Higher Error")
#Plot together with cowplot
plot_grid(forward_filtered_qual_2, reverse_filtered_qual_2, ncol=2)#Aggregated Trimmed Plots 2
forward_QC_plot_2 <-plotQualityProfile(filtered_forward_reads_2, aggregate=TRUE) + labs(title="Filtered Aggregated Forward Reads")
reverse_QC_plot_2 <- plotQualityProfile(filtered_reverse_reads_2, aggregate=TRUE) + labs(title="Filtered Aggregated Reverse Reads")
#View the plots
postqc_aggregated_plot <- plot_grid(forward_QC_plot_2, reverse_QC_plot_2, ncol=2)
postqc_aggregated_plot## reads.in reads.out
## SRR19625066_1.fastq.gz 77543 71046
## SRR19625067_1.fastq.gz 73819 66972
## SRR19625068_1.fastq.gz 248353 225428
## SRR19625069_1.fastq.gz 169479 152947
## SRR19625070_1.fastq.gz 191391 175990
## SRR19625071_1.fastq.gz 131759 120852
filtered_df %>%
reframe(median_reads_in = median(reads.in),
median_reads_out = median(reads.out),
median_percent_retained=(median(reads.out)/median(reads.in)))## median_reads_in median_reads_out median_percent_retained
## 1 119394 108216 0.9063772
Lower error parameters: Higher percent retained (59% vs 41% retained) and quality graph looks generally the same.
Will move forward with 59% retained reads, because there is some degree of error correction during dada2 pipeline. May revisit later and look at what happens with NO trimming step at all.
plot_grid(forward_raw_aggregated_plot,
reverse_raw_aggregated_plot,
forward_QC_plot_2,
reverse_QC_plot_2,
align="v",
nrow=2)## 114122853 total bases in 516393 reads from 4 samples will be used for learning the error rates.
## 103278600 total bases in 516393 reads from 4 samples will be used for learning the error rates.
forward_error_plot <- plotErrors(error_forward_reads, nominalQ=TRUE) +
labs(title="Forward Reads Error Model")
reverse_error_plot <- plotErrors(error_reverse_reads, nominalQ=TRUE) +
labs(title="Reverse Reads Error Model")
plot_grid(forward_error_plot,reverse_error_plot, ncol=2)## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
I find it difficult to interpret this, but generally error rates drop with increased Phred score.
#forward and reverse sequences
dada_forward <-dada(filtered_forward_reads_2,
err=error_forward_reads,
multithread=TRUE)## Sample 1 - 71046 reads in 35220 unique sequences.
## Sample 2 - 66972 reads in 29411 unique sequences.
## Sample 3 - 225428 reads in 80179 unique sequences.
## Sample 4 - 152947 reads in 56234 unique sequences.
## Sample 5 - 175990 reads in 64914 unique sequences.
## Sample 6 - 120852 reads in 48013 unique sequences.
## Sample 7 - 158938 reads in 72618 unique sequences.
## Sample 8 - 103350 reads in 49815 unique sequences.
## Sample 9 - 85364 reads in 40509 unique sequences.
## Sample 10 - 150286 reads in 60942 unique sequences.
## Sample 11 - 162646 reads in 60577 unique sequences.
## Sample 12 - 104183 reads in 41466 unique sequences.
## Sample 13 - 144019 reads in 56623 unique sequences.
## Sample 14 - 64744 reads in 27776 unique sequences.
## Sample 15 - 55649 reads in 25435 unique sequences.
## Sample 16 - 136356 reads in 56766 unique sequences.
## Sample 17 - 166271 reads in 62123 unique sequences.
## Sample 18 - 71408 reads in 30684 unique sequences.
## Sample 19 - 197771 reads in 64990 unique sequences.
## Sample 20 - 78170 reads in 34665 unique sequences.
## Sample 21 - 142121 reads in 51328 unique sequences.
## Sample 22 - 123771 reads in 50591 unique sequences.
## Sample 23 - 96872 reads in 48131 unique sequences.
## Sample 24 - 116031 reads in 43951 unique sequences.
## Sample 25 - 116883 reads in 51168 unique sequences.
## Sample 26 - 91426 reads in 41789 unique sequences.
## Sample 27 - 95720 reads in 39510 unique sequences.
## Sample 28 - 105899 reads in 44980 unique sequences.
## Sample 29 - 108872 reads in 49441 unique sequences.
## Sample 30 - 118042 reads in 50403 unique sequences.
## Sample 31 - 29016 reads in 14993 unique sequences.
## Sample 32 - 76786 reads in 37822 unique sequences.
## Sample 33 - 66922 reads in 35195 unique sequences.
## Sample 34 - 90321 reads in 41234 unique sequences.
## Sample 35 - 69390 reads in 33345 unique sequences.
## Sample 36 - 36454 reads in 21184 unique sequences.
## Sample 37 - 115333 reads in 50430 unique sequences.
## Sample 38 - 51892 reads in 23402 unique sequences.
## Sample 39 - 166303 reads in 66146 unique sequences.
## Sample 40 - 104965 reads in 41895 unique sequences.
## Sample 41 - 93473 reads in 38456 unique sequences.
## Sample 42 - 98228 reads in 43896 unique sequences.
## Sample 43 - 116561 reads in 51015 unique sequences.
## Sample 44 - 73117 reads in 32359 unique sequences.
## Sample 45 - 181339 reads in 70169 unique sequences.
## Sample 46 - 73644 reads in 35764 unique sequences.
## Sample 47 - 36722 reads in 19412 unique sequences.
## Sample 48 - 151263 reads in 60473 unique sequences.
## Sample 49 - 163537 reads in 65844 unique sequences.
## Sample 50 - 113857 reads in 50862 unique sequences.
## Sample 51 - 126809 reads in 51919 unique sequences.
## Sample 52 - 55466 reads in 24003 unique sequences.
## Sample 53 - 112939 reads in 47733 unique sequences.
## Sample 54 - 85701 reads in 40991 unique sequences.
## Sample 55 - 147059 reads in 62016 unique sequences.
## Sample 56 - 159243 reads in 60742 unique sequences.
## Sample 57 - 88440 reads in 39130 unique sequences.
## Sample 58 - 142925 reads in 62193 unique sequences.
## Sample 59 - 179396 reads in 78724 unique sequences.
## Sample 60 - 99779 reads in 49487 unique sequences.
## Sample 61 - 179152 reads in 75416 unique sequences.
## Sample 62 - 125691 reads in 57791 unique sequences.
## Sample 63 - 31462 reads in 18684 unique sequences.
## Sample 64 - 99430 reads in 46897 unique sequences.
## Sample 65 - 40464 reads in 21590 unique sequences.
## Sample 66 - 367545 reads in 127389 unique sequences.
## Sample 67 - 76115 reads in 41534 unique sequences.
## Sample 68 - 147529 reads in 61949 unique sequences.
## Sample 69 - 88641 reads in 41829 unique sequences.
## Sample 70 - 161364 reads in 65564 unique sequences.
## Sample 71 - 107560 reads in 43150 unique sequences.
## Sample 72 - 197848 reads in 79191 unique sequences.
## Sample 1 - 71046 reads in 42186 unique sequences.
## Sample 2 - 66972 reads in 37031 unique sequences.
## Sample 3 - 225428 reads in 111123 unique sequences.
## Sample 4 - 152947 reads in 76943 unique sequences.
## Sample 5 - 175990 reads in 85296 unique sequences.
## Sample 6 - 120852 reads in 61992 unique sequences.
## Sample 7 - 158938 reads in 89834 unique sequences.
## Sample 8 - 103350 reads in 60011 unique sequences.
## Sample 9 - 85364 reads in 49195 unique sequences.
## Sample 10 - 150286 reads in 80077 unique sequences.
## Sample 11 - 162646 reads in 85514 unique sequences.
## Sample 12 - 104183 reads in 53392 unique sequences.
## Sample 13 - 144019 reads in 73364 unique sequences.
## Sample 14 - 64744 reads in 35680 unique sequences.
## Sample 15 - 55649 reads in 31190 unique sequences.
## Sample 16 - 136356 reads in 72218 unique sequences.
## Sample 17 - 166271 reads in 84133 unique sequences.
## Sample 18 - 71408 reads in 39651 unique sequences.
## Sample 19 - 197771 reads in 89402 unique sequences.
## Sample 20 - 78170 reads in 43396 unique sequences.
## Sample 21 - 142121 reads in 70892 unique sequences.
## Sample 22 - 123771 reads in 66248 unique sequences.
## Sample 23 - 96872 reads in 58535 unique sequences.
## Sample 24 - 116031 reads in 59396 unique sequences.
## Sample 25 - 116883 reads in 63664 unique sequences.
## Sample 26 - 91426 reads in 52026 unique sequences.
## Sample 27 - 95720 reads in 51940 unique sequences.
## Sample 28 - 105899 reads in 57862 unique sequences.
## Sample 29 - 108872 reads in 61317 unique sequences.
## Sample 30 - 118042 reads in 64384 unique sequences.
## Sample 31 - 29016 reads in 18014 unique sequences.
## Sample 32 - 76786 reads in 46115 unique sequences.
## Sample 33 - 66922 reads in 41333 unique sequences.
## Sample 34 - 90321 reads in 51674 unique sequences.
## Sample 35 - 69390 reads in 38955 unique sequences.
## Sample 36 - 36454 reads in 24285 unique sequences.
## Sample 37 - 115333 reads in 63538 unique sequences.
## Sample 38 - 51892 reads in 29131 unique sequences.
## Sample 39 - 166303 reads in 84461 unique sequences.
## Sample 40 - 104965 reads in 55214 unique sequences.
## Sample 41 - 93473 reads in 50062 unique sequences.
## Sample 42 - 98228 reads in 54465 unique sequences.
## Sample 43 - 116561 reads in 64425 unique sequences.
## Sample 44 - 73117 reads in 42204 unique sequences.
## Sample 45 - 181339 reads in 91570 unique sequences.
## Sample 46 - 73644 reads in 42955 unique sequences.
## Sample 47 - 36722 reads in 23512 unique sequences.
## Sample 48 - 151263 reads in 77391 unique sequences.
## Sample 49 - 163537 reads in 84892 unique sequences.
## Sample 50 - 113857 reads in 63773 unique sequences.
## Sample 51 - 126809 reads in 65435 unique sequences.
## Sample 52 - 55466 reads in 29827 unique sequences.
## Sample 53 - 112939 reads in 58442 unique sequences.
## Sample 54 - 85701 reads in 49066 unique sequences.
## Sample 55 - 147059 reads in 80401 unique sequences.
## Sample 56 - 159243 reads in 79253 unique sequences.
## Sample 57 - 88440 reads in 48602 unique sequences.
## Sample 58 - 142925 reads in 77054 unique sequences.
## Sample 59 - 179396 reads in 102495 unique sequences.
## Sample 60 - 99779 reads in 59204 unique sequences.
## Sample 61 - 179152 reads in 95946 unique sequences.
## Sample 62 - 125691 reads in 69446 unique sequences.
## Sample 63 - 31462 reads in 21299 unique sequences.
## Sample 64 - 99430 reads in 58826 unique sequences.
## Sample 65 - 40464 reads in 25288 unique sequences.
## Sample 66 - 367545 reads in 174846 unique sequences.
## Sample 67 - 76115 reads in 48368 unique sequences.
## Sample 68 - 147529 reads in 76770 unique sequences.
## Sample 69 - 88641 reads in 53330 unique sequences.
## Sample 70 - 161364 reads in 84568 unique sequences.
## Sample 71 - 107560 reads in 57671 unique sequences.
## Sample 72 - 197848 reads in 104514 unique sequences.
merged_asvs <- mergePairs(dada_forward,
filtered_forward_reads_2,
dada_reverse,
filtered_reverse_reads_2,
verbose=TRUE)## 41654 paired-reads (in 1064 unique pairings) successfully merged out of 61969 (in 3320 pairings) input.
## 42911 paired-reads (in 1130 unique pairings) successfully merged out of 59874 (in 3169 pairings) input.
## 160215 paired-reads (in 4410 unique pairings) successfully merged out of 208220 (in 11870 pairings) input.
## 104690 paired-reads (in 2902 unique pairings) successfully merged out of 140473 (in 8082 pairings) input.
## 124216 paired-reads (in 2799 unique pairings) successfully merged out of 162124 (in 7913 pairings) input.
## 83127 paired-reads (in 2183 unique pairings) successfully merged out of 110011 (in 6324 pairings) input.
## 93671 paired-reads (in 2484 unique pairings) successfully merged out of 142303 (in 8798 pairings) input.
## 61322 paired-reads (in 1683 unique pairings) successfully merged out of 90966 (in 5621 pairings) input.
## 51272 paired-reads (in 1261 unique pairings) successfully merged out of 75737 (in 4605 pairings) input.
## 97420 paired-reads (in 2508 unique pairings) successfully merged out of 136343 (in 7558 pairings) input.
## 116327 paired-reads (in 3267 unique pairings) successfully merged out of 148822 (in 8131 pairings) input.
## 70300 paired-reads (in 1842 unique pairings) successfully merged out of 94525 (in 5290 pairings) input.
## 96080 paired-reads (in 2724 unique pairings) successfully merged out of 130650 (in 7751 pairings) input.
## 42238 paired-reads (in 1190 unique pairings) successfully merged out of 58089 (in 3160 pairings) input.
## 35789 paired-reads (in 1003 unique pairings) successfully merged out of 49335 (in 2829 pairings) input.
## 87449 paired-reads (in 2268 unique pairings) successfully merged out of 122741 (in 6705 pairings) input.
## 114512 paired-reads (in 3171 unique pairings) successfully merged out of 152575 (in 8701 pairings) input.
## 44949 paired-reads (in 1327 unique pairings) successfully merged out of 63777 (in 3536 pairings) input.
## 146446 paired-reads (in 3742 unique pairings) successfully merged out of 183991 (in 9397 pairings) input.
## 48818 paired-reads (in 1321 unique pairings) successfully merged out of 69150 (in 3762 pairings) input.
## 100162 paired-reads (in 2753 unique pairings) successfully merged out of 130463 (in 7250 pairings) input.
## 84398 paired-reads (in 2002 unique pairings) successfully merged out of 112702 (in 6223 pairings) input.
## 54705 paired-reads (in 1448 unique pairings) successfully merged out of 85189 (in 5116 pairings) input.
## 80168 paired-reads (in 2325 unique pairings) successfully merged out of 106309 (in 6020 pairings) input.
## 76434 paired-reads (in 1804 unique pairings) successfully merged out of 105279 (in 6275 pairings) input.
## 55690 paired-reads (in 1471 unique pairings) successfully merged out of 81014 (in 4570 pairings) input.
## 63180 paired-reads (in 1607 unique pairings) successfully merged out of 86947 (in 4296 pairings) input.
## 68970 paired-reads (in 1874 unique pairings) successfully merged out of 95318 (in 5296 pairings) input.
## 68855 paired-reads (in 1675 unique pairings) successfully merged out of 97539 (in 5450 pairings) input.
## 74198 paired-reads (in 2026 unique pairings) successfully merged out of 106394 (in 5981 pairings) input.
## 16780 paired-reads (in 499 unique pairings) successfully merged out of 25071 (in 1273 pairings) input.
## 46005 paired-reads (in 1092 unique pairings) successfully merged out of 67740 (in 3833 pairings) input.
## 35970 paired-reads (in 941 unique pairings) successfully merged out of 58134 (in 3290 pairings) input.
## 55555 paired-reads (in 1487 unique pairings) successfully merged out of 80523 (in 4338 pairings) input.
## 41521 paired-reads (in 883 unique pairings) successfully merged out of 61880 (in 3214 pairings) input.
## 18714 paired-reads (in 458 unique pairings) successfully merged out of 30403 (in 1584 pairings) input.
## 71226 paired-reads (in 1984 unique pairings) successfully merged out of 103509 (in 5978 pairings) input.
## 33777 paired-reads (in 887 unique pairings) successfully merged out of 46103 (in 2282 pairings) input.
## 112255 paired-reads (in 2733 unique pairings) successfully merged out of 152287 (in 8610 pairings) input.
## 69079 paired-reads (in 1954 unique pairings) successfully merged out of 94807 (in 5306 pairings) input.
## 61996 paired-reads (in 1729 unique pairings) successfully merged out of 83864 (in 4682 pairings) input.
## 62702 paired-reads (in 1545 unique pairings) successfully merged out of 87697 (in 4822 pairings) input.
## 73617 paired-reads (in 1979 unique pairings) successfully merged out of 104525 (in 5989 pairings) input.
## 46148 paired-reads (in 1279 unique pairings) successfully merged out of 64714 (in 3424 pairings) input.
## 123470 paired-reads (in 3013 unique pairings) successfully merged out of 166472 (in 9212 pairings) input.
## 44589 paired-reads (in 1159 unique pairings) successfully merged out of 64689 (in 3514 pairings) input.
## 21573 paired-reads (in 584 unique pairings) successfully merged out of 31588 (in 1669 pairings) input.
## 101688 paired-reads (in 2476 unique pairings) successfully merged out of 137564 (in 7443 pairings) input.
## 109388 paired-reads (in 2306 unique pairings) successfully merged out of 149734 (in 7508 pairings) input.
## 70934 paired-reads (in 1880 unique pairings) successfully merged out of 102161 (in 5659 pairings) input.
## 83246 paired-reads (in 2331 unique pairings) successfully merged out of 114961 (in 6645 pairings) input.
## 36675 paired-reads (in 990 unique pairings) successfully merged out of 49577 (in 2471 pairings) input.
## 74082 paired-reads (in 1868 unique pairings) successfully merged out of 101709 (in 5685 pairings) input.
## 51780 paired-reads (in 1289 unique pairings) successfully merged out of 75903 (in 4135 pairings) input.
## 94461 paired-reads (in 2433 unique pairings) successfully merged out of 132799 (in 7464 pairings) input.
## 108641 paired-reads (in 2473 unique pairings) successfully merged out of 145636 (in 7331 pairings) input.
## 55043 paired-reads (in 1396 unique pairings) successfully merged out of 78952 (in 4233 pairings) input.
## 91558 paired-reads (in 2112 unique pairings) successfully merged out of 129192 (in 7106 pairings) input.
## 113519 paired-reads (in 2941 unique pairings) successfully merged out of 161091 (in 9950 pairings) input.
## 56394 paired-reads (in 1478 unique pairings) successfully merged out of 87235 (in 5167 pairings) input.
## 114904 paired-reads (in 3105 unique pairings) successfully merged out of 161651 (in 9499 pairings) input.
## 76950 paired-reads (in 1689 unique pairings) successfully merged out of 111701 (in 6103 pairings) input.
## 15411 paired-reads (in 409 unique pairings) successfully merged out of 25816 (in 1339 pairings) input.
## 59333 paired-reads (in 1552 unique pairings) successfully merged out of 88291 (in 4842 pairings) input.
## 22405 paired-reads (in 620 unique pairings) successfully merged out of 33987 (in 1741 pairings) input.
## 260390 paired-reads (in 6228 unique pairings) successfully merged out of 341354 (in 18461 pairings) input.
## 41002 paired-reads (in 992 unique pairings) successfully merged out of 65539 (in 3548 pairings) input.
## 95910 paired-reads (in 2202 unique pairings) successfully merged out of 133931 (in 7309 pairings) input.
## 53436 paired-reads (in 1362 unique pairings) successfully merged out of 78012 (in 4149 pairings) input.
## 105352 paired-reads (in 2682 unique pairings) successfully merged out of 147213 (in 8727 pairings) input.
## 72001 paired-reads (in 1724 unique pairings) successfully merged out of 97976 (in 4864 pairings) input.
## 137546 paired-reads (in 2876 unique pairings) successfully merged out of 181542 (in 9791 pairings) input.
## [1] 72 47259
## [1] "matrix" "array"
##
## 262 263 264 265 266 267 268 269 270 271 272 273 274
## 23 108 75 289 54 223 121 131 1 1 35 50 34
## 275 276 277 278 279 280 281 282 284 285 286 287 288
## 56 162 41981 3166 122 113 15 24 2 3 8 4 1
## 289 290 294 297 298 299 300 301 302 304 305 306 307
## 7 1 1 7 2 64 28 1 4 6 1 1 2
## 308 309 311 312 316 319 320 326 372 381 382 384 386
## 5 44 196 1 1 1 1 2 1 20 6 4 14
## 387 408 409
## 34 1 1
data.frame(Seq_Length = nchar(getSequences(raw_asv_table))) %>%
ggplot(aes(x = Seq_Length )) +
geom_histogram(binwidth=5,
breaks = seq(265, 280, by = 2)) +
labs(title = "Raw distribution of ASV length")raw_asv_table_trimmed <- raw_asv_table[,nchar(colnames(raw_asv_table)) %in% 260:290]
sum(raw_asv_table_trimmed)/sum(raw_asv_table)## [1] 0.9951039
data.frame(Seq_Length = nchar(getSequences(raw_asv_table_trimmed))) %>%
ggplot(aes(x = Seq_Length )) +
geom_histogram() +
labs(title = "Trimmed distribution of ASV length")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
nochimeras_asv_table <- removeBimeraDenovo(raw_asv_table_trimmed, method="consensus", multithread=TRUE,verbose=TRUE)## Identified 10417 bimeras out of 46810 input sequences.
## [1] 72 36393
## [1] 0.8592173
## [1] 0.8634448
getN <- function(x) sum(getUniques(x))
track <- cbind(filtered_reads_2, sapply(dada_forward, getN), sapply(dada_reverse, getN), sapply(merged_asvs, getN), rowSums(nochimeras_asv_table))
head(track)## reads.in reads.out
## SRR19625066_1.fastq.gz 77543 71046 64789 66410 41654 38278
## SRR19625067_1.fastq.gz 73819 66972 62259 63160 42911 37634
## SRR19625068_1.fastq.gz 248353 225428 214306 217535 160215 130894
## SRR19625069_1.fastq.gz 169479 152947 145208 146663 104690 83668
## SRR19625070_1.fastq.gz 191391 175990 166978 169110 124216 110464
## SRR19625071_1.fastq.gz 131759 120852 113897 115399 83127 71631
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")
rownames(track) <- samples
track_df <-
track %>%
as.data.frame() %>%
rownames_to_column(var="names") %>%
mutate(perc_reads_retained = 100 * nochim / input)
DT::datatable(track_df)track_df %>%
pivot_longer(input:nochim, names_to = "read_type", values_to = "num_reads") %>%
mutate(read_type = fct_relevel(read_type,
"input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")) %>%
ggplot(aes(x = read_type, y = num_reads, fill = read_type)) +
geom_line(aes(group = names), color = "grey") +
geom_point(shape = 21, size = 3, alpha = 0.8) +
scale_fill_brewer(palette = "Spectral") +
labs(x = "Filtering Step", y = "Number of Sequences") +
theme_bw()# Which sample had the lowest number of reads going into dada2?
lowest_reads <- track_df[order(track_df$input),]
head(lowest_reads)## names input filtered denoisedF denoisedR merged nochim
## 31 SRR19625096 31920 29016 26324 26749 16780 14978
## 63 SRR19625128 34359 31462 27406 28216 15411 14389
## 36 SRR19625101 39920 36454 32227 32971 18714 17207
## 47 SRR19625112 40693 36722 33225 33674 21573 19259
## 65 SRR19625130 44526 40464 36093 36717 22405 20233
## 38 SRR19625103 56813 51892 47912 48768 33777 30841
## perc_reads_retained
## 31 46.92356
## 63 41.87840
## 36 43.10371
## 47 47.32755
## 65 45.44087
## 38 54.28511
# Which sample had the most reads filtered out?
most_filtered <- track_df[order(track_df$perc_reads_retained),]
head(most_filtered)## names input filtered denoisedF denoisedR merged nochim
## 63 SRR19625128 34359 31462 27406 28216 15411 14389
## 36 SRR19625101 39920 36454 32227 32971 18714 17207
## 35 SRR19625100 75599 69390 64531 65191 41521 33114
## 33 SRR19625098 73272 66922 60993 62227 35970 32561
## 65 SRR19625130 44526 40464 36093 36717 22405 20233
## 67 SRR19625132 83941 76115 68869 70393 41002 38351
## perc_reads_retained
## 63 41.87840
## 36 43.10371
## 35 43.80217
## 33 44.43853
## 65 45.44087
## 67 45.68804
least_filtered <-
track_df[order(track_df$perc_reads_retained, decreasing=TRUE),]
head(least_filtered)## names input filtered denoisedF denoisedR merged nochim
## 5 SRR19625070 191391 175990 166978 169110 124216 110464
## 19 SRR19625084 215116 197771 189212 191141 146446 121550
## 49 SRR19625114 178406 163537 154348 156928 109388 100675
## 72 SRR19625137 217291 197848 187684 189461 137546 120542
## 56 SRR19625121 174534 159243 150262 152539 108641 95375
## 71 SRR19625136 120024 107560 101285 102745 72001 65457
## perc_reads_retained
## 5 57.71640
## 19 56.50440
## 49 56.43028
## 72 55.47492
## 56 54.64551
## 71 54.53659
## [1] 14389 205503
## [1] 60888
This data was extremely filtered throughout the whole trimming and error reduction process. I may repeat the analysis with no trimming step, although it may still filter out just as much!!
#Using silva database
taxa_db <- assignTaxonomy(nochimeras_asv_table, "/workdir/in_class_data/taxonomy/silva_nr99_v138.1_train_set.fa.gz")
#assigning species with silva database
taxa_species <- addSpecies(taxa_db, "/workdir/in_class_data/taxonomy/silva_species_assignment_v138.1.fa.gz")
taxa_print <- taxa_species
rownames(taxa_print) <- NULL
head(taxa_print)## Kingdom Phylum Class Order
## [1,] "Bacteria" "Actinobacteriota" "Actinobacteria" "Corynebacteriales"
## [2,] "Bacteria" "Actinobacteriota" "Actinobacteria" "Corynebacteriales"
## [3,] "Bacteria" "Actinobacteriota" "Actinobacteria" "Corynebacteriales"
## [4,] "Bacteria" "Actinobacteriota" "Actinobacteria" "Corynebacteriales"
## [5,] "Bacteria" "Actinobacteriota" "Actinobacteria" "Corynebacteriales"
## [6,] "Bacteria" "Actinobacteriota" "Actinobacteria" "Corynebacteriales"
## Family Genus Species
## [1,] "Mycobacteriaceae" "Mycobacterium" NA
## [2,] "Mycobacteriaceae" "Mycobacterium" NA
## [3,] "Mycobacteriaceae" "Mycobacterium" NA
## [4,] "Mycobacteriaceae" "Mycobacterium" NA
## [5,] "Mycobacteriaceae" "Mycobacterium" NA
## [6,] "Mycobacteriaceae" "Mycobacterium" NA
Output will include:
####ASV Export
#Pull ASV sequences
asv_seqs <- colnames(nochimeras_asv_table)
#Headers for asv seqs
asv_headers <- vector(dim(nochimeras_asv_table)[2], mode="character")
asv_headers[1:5]## [1] "" "" "" "" ""
#Fill vector with asv names
for (i in 1:dim(nochimeras_asv_table)[2]) {
asv_headers[i] <- paste("ASV", i, sep="_")
}
head(asv_headers)## [1] "ASV_1" "ASV_2" "ASV_3" "ASV_4" "ASV_5" "ASV_6"
###Rename ASVs and write as table
asv_table <- t(nochimeras_asv_table)
row.names(asv_table) <- sub(">","", asv_headers)
####Taxonomy table
tax_table <- taxa_species %>%
as.data.frame() %>%
rownames_to_column(var="asv_seqs")
head(tax_table)## asv_seqs
## 1 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCC
## 2 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCC
## 3 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCT
## 4 CCGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCC
## 5 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCG
## 6 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCT
## Kingdom Phylum Class Order Family
## 1 Bacteria Actinobacteriota Actinobacteria Corynebacteriales Mycobacteriaceae
## 2 Bacteria Actinobacteriota Actinobacteria Corynebacteriales Mycobacteriaceae
## 3 Bacteria Actinobacteriota Actinobacteria Corynebacteriales Mycobacteriaceae
## 4 Bacteria Actinobacteriota Actinobacteria Corynebacteriales Mycobacteriaceae
## 5 Bacteria Actinobacteriota Actinobacteria Corynebacteriales Mycobacteriaceae
## 6 Bacteria Actinobacteriota Actinobacteria Corynebacteriales Mycobacteriaceae
## Genus Species
## 1 Mycobacterium <NA>
## 2 Mycobacterium <NA>
## 3 Mycobacterium <NA>
## 4 Mycobacterium <NA>
## 5 Mycobacterium <NA>
## 6 Mycobacterium <NA>
## asv_seqs
## ASV_1 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCC
## ASV_2 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCC
## ASV_3 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCT
## ASV_4 CCGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCC
## ASV_5 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCG
## ASV_6 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCT
## Kingdom Phylum Class Order
## ASV_1 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## ASV_2 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## ASV_3 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## ASV_4 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## ASV_5 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## ASV_6 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## Family Genus Species
## ASV_1 Mycobacteriaceae Mycobacterium <NA>
## ASV_2 Mycobacteriaceae Mycobacterium <NA>
## ASV_3 Mycobacteriaceae Mycobacterium <NA>
## ASV_4 Mycobacteriaceae Mycobacterium <NA>
## ASV_5 Mycobacteriaceae Mycobacterium <NA>
## ASV_6 Mycobacteriaceae Mycobacterium <NA>
#New column with ASV names
asv_taxa <-
tax_table %>%
mutate(ASV=rownames(asv_table)) %>%
dplyr::select(Kingdom, Phylum, Class, Order, Family, Genus, Species, ASV, asv_seqs)
head(asv_taxa)## Kingdom Phylum Class Order
## ASV_1 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## ASV_2 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## ASV_3 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## ASV_4 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## ASV_5 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## ASV_6 Bacteria Actinobacteriota Actinobacteria Corynebacteriales
## Family Genus Species ASV
## ASV_1 Mycobacteriaceae Mycobacterium <NA> ASV_1
## ASV_2 Mycobacteriaceae Mycobacterium <NA> ASV_2
## ASV_3 Mycobacteriaceae Mycobacterium <NA> ASV_3
## ASV_4 Mycobacteriaceae Mycobacterium <NA> ASV_4
## ASV_5 Mycobacteriaceae Mycobacterium <NA> ASV_5
## ASV_6 Mycobacteriaceae Mycobacterium <NA> ASV_6
## asv_seqs
## ASV_1 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCC
## ASV_2 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCC
## ASV_3 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCT
## ASV_4 CCGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCC
## ASV_5 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCG
## ASV_6 CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCT
Files to be written:
#Count table with numbered ASV names
write.table(asv_table, "data/01_dada2/asv_counts.tsv", sep = "\t", quote = FALSE, col.names = NA)
#Table with ASV sequence names
write.table(nochimeras_asv_table, "data/01_dada2/asv_counts_with_seqnames.tsv", sep="\t", quote=FALSE,col.names=NA)
#Fasta file for reference later
asv_fasta <- c(rbind(asv_headers, asv_seqs))
head(asv_fasta)## [1] "ASV_1"
## [2] "CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCC"
## [3] "ASV_2"
## [4] "CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCC"
## [5] "ASV_3"
## [6] "CAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCGCGTTGTTCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGGCGATACGGGCAGACTGGAGTACTGCAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACGCTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGAAACCCT"
write(asv_fasta, "data/01_dada2/asvs.fasta")
#Save taxonomy tables
write.table(asv_taxa, "data/01_dada2/asv_taxonomy.tsv", sep="\t", quote=FALSE, col.names=NA)
# Save to an RData object
# No chimeras asv table
save(nochimeras_asv_table, file="data/01_dada2/nochimeras_asv_table.Rdata")
#asv counts
save(asv_table, file="data/01_dada2/asv_counts.RData")
#tracking counts dataframe
save(track_df, file="data/01_dada2/track_read_counts.Rdata")## [1] "2024-03-17 15:37:11 EDT"
## Time difference of 3.664 hours
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.3.2 (2023-10-31)
## os Rocky Linux 9.0 (Blue Onyx)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/New_York
## date 2024-03-17
## pandoc 3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-5 2016-07-21 [2] CRAN (R 4.3.2)
## ade4 1.7-22 2023-02-06 [1] CRAN (R 4.3.2)
## ape 5.7-1 2023-03-13 [2] CRAN (R 4.3.2)
## Biobase 2.62.0 2023-10-24 [2] Bioconductor
## BiocGenerics 0.48.1 2023-11-01 [2] Bioconductor
## BiocManager * 1.30.22 2023-08-08 [2] CRAN (R 4.3.2)
## BiocParallel 1.36.0 2023-10-24 [2] Bioconductor
## biomformat 1.30.0 2023-10-24 [1] Bioconductor
## Biostrings 2.70.1 2023-10-25 [2] Bioconductor
## bitops 1.0-7 2021-04-24 [2] CRAN (R 4.3.2)
## bslib 0.5.1 2023-08-11 [2] CRAN (R 4.3.2)
## cachem 1.0.8 2023-05-01 [2] CRAN (R 4.3.2)
## callr 3.7.3 2022-11-02 [2] CRAN (R 4.3.2)
## cli 3.6.1 2023-03-23 [2] CRAN (R 4.3.2)
## cluster 2.1.4 2022-08-22 [2] CRAN (R 4.3.2)
## codetools 0.2-19 2023-02-01 [2] CRAN (R 4.3.2)
## colorspace 2.1-0 2023-01-23 [2] CRAN (R 4.3.2)
## cowplot * 1.1.3 2024-01-22 [1] CRAN (R 4.3.2)
## crayon 1.5.2 2022-09-29 [2] CRAN (R 4.3.2)
## crosstalk 1.2.0 2021-11-04 [2] CRAN (R 4.3.2)
## dada2 * 1.30.0 2023-10-24 [1] Bioconductor
## data.table 1.14.8 2023-02-17 [2] CRAN (R 4.3.2)
## DelayedArray 0.28.0 2023-10-24 [2] Bioconductor
## deldir 1.0-9 2023-05-17 [2] CRAN (R 4.3.2)
## devtools * 2.4.4 2022-07-20 [2] CRAN (R 4.2.1)
## digest 0.6.33 2023-07-07 [2] CRAN (R 4.3.2)
## dplyr * 1.1.3 2023-09-03 [2] CRAN (R 4.3.2)
## DT * 0.32 2024-02-19 [1] CRAN (R 4.3.2)
## ellipsis 0.3.2 2021-04-29 [2] CRAN (R 4.3.2)
## evaluate 0.23 2023-11-01 [2] CRAN (R 4.3.2)
## fansi 1.0.5 2023-10-08 [2] CRAN (R 4.3.2)
## farver 2.1.1 2022-07-06 [2] CRAN (R 4.3.2)
## fastmap 1.1.1 2023-02-24 [2] CRAN (R 4.3.2)
## forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.2)
## foreach 1.5.2 2022-02-02 [2] CRAN (R 4.3.2)
## fs 1.6.3 2023-07-20 [2] CRAN (R 4.3.2)
## generics 0.1.3 2022-07-05 [2] CRAN (R 4.3.2)
## GenomeInfoDb 1.38.0 2023-10-24 [2] Bioconductor
## GenomeInfoDbData 1.2.11 2023-11-07 [2] Bioconductor
## GenomicAlignments 1.38.0 2023-10-24 [2] Bioconductor
## GenomicRanges 1.54.1 2023-10-29 [2] Bioconductor
## ggplot2 * 3.5.0 2024-02-23 [1] CRAN (R 4.3.2)
## glue 1.6.2 2022-02-24 [2] CRAN (R 4.3.2)
## gtable 0.3.4 2023-08-21 [2] CRAN (R 4.3.2)
## highr 0.10 2022-12-22 [2] CRAN (R 4.3.2)
## hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.2)
## htmltools 0.5.7 2023-11-03 [2] CRAN (R 4.3.2)
## htmlwidgets 1.6.2 2023-03-17 [2] CRAN (R 4.3.2)
## httpuv 1.6.12 2023-10-23 [2] CRAN (R 4.3.2)
## hwriter 1.3.2.1 2022-04-08 [1] CRAN (R 4.3.2)
## igraph 1.5.1 2023-08-10 [2] CRAN (R 4.3.2)
## iNEXT * 3.0.0 2022-08-29 [1] CRAN (R 4.3.2)
## interp 1.1-6 2024-01-26 [1] CRAN (R 4.3.2)
## IRanges 2.36.0 2023-10-24 [2] Bioconductor
## iterators 1.0.14 2022-02-05 [2] CRAN (R 4.3.2)
## jpeg 0.1-10 2022-11-29 [1] CRAN (R 4.3.2)
## jquerylib 0.1.4 2021-04-26 [2] CRAN (R 4.3.2)
## jsonlite 1.8.7 2023-06-29 [2] CRAN (R 4.3.2)
## knitr 1.45 2023-10-30 [2] CRAN (R 4.3.2)
## labeling 0.4.3 2023-08-29 [2] CRAN (R 4.3.2)
## later 1.3.1 2023-05-02 [2] CRAN (R 4.3.2)
## lattice * 0.21-9 2023-10-01 [2] CRAN (R 4.3.2)
## latticeExtra 0.6-30 2022-07-04 [1] CRAN (R 4.3.2)
## lifecycle 1.0.3 2022-10-07 [2] CRAN (R 4.3.2)
## lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.2)
## magrittr 2.0.3 2022-03-30 [2] CRAN (R 4.3.2)
## MASS 7.3-60 2023-05-04 [2] CRAN (R 4.3.2)
## Matrix 1.6-1.1 2023-09-18 [2] CRAN (R 4.3.2)
## MatrixGenerics 1.14.0 2023-10-24 [2] Bioconductor
## matrixStats 1.1.0 2023-11-07 [2] CRAN (R 4.3.2)
## memoise 2.0.1 2021-11-26 [2] CRAN (R 4.3.2)
## mgcv 1.9-0 2023-07-11 [2] CRAN (R 4.3.2)
## mime 0.12 2021-09-28 [2] CRAN (R 4.3.2)
## miniUI 0.1.1.1 2018-05-18 [2] CRAN (R 4.3.2)
## multtest 2.58.0 2023-10-24 [1] Bioconductor
## munsell 0.5.0 2018-06-12 [2] CRAN (R 4.3.2)
## nlme 3.1-163 2023-08-09 [2] CRAN (R 4.3.2)
## pacman * 0.5.1 2019-03-11 [1] CRAN (R 4.3.2)
## patchwork * 1.2.0.9000 2024-03-12 [1] Github (thomasp85/patchwork@d943757)
## permute * 0.9-7 2022-01-27 [1] CRAN (R 4.3.2)
## phyloseq * 1.46.0 2023-10-24 [1] Bioconductor
## pillar 1.9.0 2023-03-22 [2] CRAN (R 4.3.2)
## pkgbuild 1.4.2 2023-06-26 [2] CRAN (R 4.3.2)
## pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.3.2)
## pkgload 1.3.3 2023-09-22 [2] CRAN (R 4.3.2)
## plyr 1.8.9 2023-10-02 [2] CRAN (R 4.3.2)
## png 0.1-8 2022-11-29 [2] CRAN (R 4.3.2)
## prettyunits 1.2.0 2023-09-24 [2] CRAN (R 4.3.2)
## processx 3.8.2 2023-06-30 [2] CRAN (R 4.3.2)
## profvis 0.3.8 2023-05-02 [2] CRAN (R 4.3.2)
## promises 1.2.1 2023-08-10 [2] CRAN (R 4.3.2)
## ps 1.7.5 2023-04-18 [2] CRAN (R 4.3.2)
## purrr * 1.0.2 2023-08-10 [2] CRAN (R 4.3.2)
## R6 2.5.1 2021-08-19 [2] CRAN (R 4.3.2)
## RColorBrewer 1.1-3 2022-04-03 [2] CRAN (R 4.3.2)
## Rcpp * 1.0.11 2023-07-06 [2] CRAN (R 4.3.2)
## RcppParallel 5.1.7 2023-02-27 [2] CRAN (R 4.3.2)
## RCurl 1.98-1.13 2023-11-02 [2] CRAN (R 4.3.2)
## readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.2)
## remotes 2.4.2.1 2023-07-18 [2] CRAN (R 4.3.2)
## reshape2 1.4.4 2020-04-09 [2] CRAN (R 4.3.2)
## rhdf5 2.46.1 2023-11-29 [1] Bioconductor 3.18 (R 4.3.2)
## rhdf5filters 1.14.1 2023-11-06 [1] Bioconductor
## Rhdf5lib 1.24.2 2024-02-07 [1] Bioconductor 3.18 (R 4.3.2)
## rlang 1.1.2 2023-11-04 [2] CRAN (R 4.3.2)
## rmarkdown 2.25 2023-09-18 [2] CRAN (R 4.3.2)
## Rsamtools 2.18.0 2023-10-24 [2] Bioconductor
## rstudioapi 0.15.0 2023-07-07 [2] CRAN (R 4.3.2)
## S4Arrays 1.2.0 2023-10-24 [2] Bioconductor
## S4Vectors 0.40.1 2023-10-26 [2] Bioconductor
## sass 0.4.7 2023-07-15 [2] CRAN (R 4.3.2)
## scales 1.3.0 2023-11-28 [2] CRAN (R 4.3.2)
## sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.3.2)
## shiny 1.7.5.1 2023-10-14 [2] CRAN (R 4.3.2)
## ShortRead 1.60.0 2023-10-24 [1] Bioconductor
## SparseArray 1.2.1 2023-11-05 [2] Bioconductor
## stringi 1.7.12 2023-01-11 [2] CRAN (R 4.3.2)
## stringr * 1.5.0 2022-12-02 [2] CRAN (R 4.3.2)
## SummarizedExperiment 1.32.0 2023-10-24 [2] Bioconductor
## survival 3.5-7 2023-08-14 [2] CRAN (R 4.3.2)
## tibble * 3.2.1 2023-03-20 [2] CRAN (R 4.3.2)
## tidyr * 1.3.0 2023-01-24 [2] CRAN (R 4.3.2)
## tidyselect 1.2.0 2022-10-10 [2] CRAN (R 4.3.2)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.2)
## timechange 0.3.0 2024-01-18 [1] CRAN (R 4.3.2)
## tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.2)
## urlchecker 1.0.1 2021-11-30 [2] CRAN (R 4.3.2)
## usethis * 2.2.2 2023-07-06 [2] CRAN (R 4.3.2)
## utf8 1.2.4 2023-10-22 [2] CRAN (R 4.3.2)
## vctrs 0.6.4 2023-10-12 [2] CRAN (R 4.3.2)
## vegan * 2.6-4 2022-10-11 [1] CRAN (R 4.3.2)
## withr 2.5.2 2023-10-30 [2] CRAN (R 4.3.2)
## xfun 0.41 2023-11-01 [2] CRAN (R 4.3.2)
## xtable 1.8-4 2019-04-21 [2] CRAN (R 4.3.2)
## XVector 0.42.0 2023-10-24 [2] Bioconductor
## yaml 2.3.7 2023-01-23 [2] CRAN (R 4.3.2)
## zlibbioc 1.48.0 2023-10-24 [2] Bioconductor
##
## [1] /home/mld253/R/x86_64-pc-linux-gnu-library/4.3
## [2] /programs/R-4.3.2/library
##
## ──────────────────────────────────────────────────────────────────────────────